'''
NOTE about the files
our files as we read them into this notebook are saved in this hierarchy
data/
├── bycode2018.xlsx
├── index 2018.xlsx
├── coord.csv
│
├── 21/
│ ├── expb.csv
│ ├── expc.csv
├── 22/
│ ├── expb.csv
│ ├── expc.csv
├── 23/
│ ├── expb.csv
│ ├── expc.csv
Where coord.csv is a new csv file we included, similar to bycode.xlsx but from 4 years ago.
We needed it because it holds the exact location of each settlement, which allowed us to draw them on Israel's map.
The csv file will be added with the homework submission.
All other files were not changed; we used them exactly as they were when we downloaded them.
'''
# Load libraries
import pandas as pd
import numpy as np
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.express as px
import plotly.graph_objects as go
import plotly
import re
import folium
import random
from pyproj import Proj, transform
from scipy.spatial.distance import cdist
import seaborn as sns
from branca.element import Template, MacroElement
plt.style.use('fivethirtyeight')
import sklearn
from sklearn import metrics
from sklearn import mixture
from sklearn.cluster import SpectralClustering, DBSCAN, KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import r2_score, mean_squared_error, confusion_matrix
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
random.seed(1234)
# Get all csv's: three election rounds (21/22/23), expb/expc each, plus the
# two bycode excel sheets.
# NOTE: the original paths were written as r'data\\21', which in a raw
# string is the literal text "data\\21" (two backslashes) and only worked
# by accident on Windows.  os.path.join builds the right separator on
# every OS.
data_path = os.path.join('data', '21')
c21 = pd.read_csv(os.path.join(data_path, "expc.csv"), header=0)
b21 = pd.read_csv(os.path.join(data_path, "expb.csv"), header=0)
data_path = os.path.join('data', '22')
c22 = pd.read_csv(os.path.join(data_path, "expc.csv"), header=0)
b22 = pd.read_csv(os.path.join(data_path, "expb.csv"), header=0)
data_path = os.path.join('data', '23')
c23 = pd.read_csv(os.path.join(data_path, "expc.csv"), header=0)
b23 = pd.read_csv(os.path.join(data_path, "expb.csv"), header=0)
data_path = 'data'
bycode = pd.read_excel(os.path.join(data_path, "bycode2018.xlsx"), header=0)
index = pd.read_excel(os.path.join(data_path, "index 2018.xlsx"))
In each section we need different data based on the requested analysis. To limit the chance of one section "changing" the data of another, we used new dataframes for each section, and the cleansing logic lives in methods that we can call each time.
# So we wont have to drop it each time
# 'Unnamed: 37' is an artifact column created by a trailing comma in the
# csv header; it carries no data, so drop it once up front.
c21 = c21.drop(['Unnamed: 37'], axis=1)
def process_bycode(df):
    '''
    Clean the raw bycode table.

    Drops columns that are mostly NaN or otherwise unused, fills missing
    values (zeros, column mean, or documented defaults), and buckets the
    fine-grained locality-type / municipal-status codes into coarse groups.

    Parameters
    ----------
    df : pd.DataFrame
        Raw bycode dataframe (Hebrew column names).

    Returns
    -------
    pd.DataFrame
        The cleaned dataframe.
    '''
    df = df.drop(['יהודים ואחרים', 'שנת ייסוד',
                  'שיוך מטרופוליני', 'תעתיק', 'אשכול רשויות מקומיות', 'קואורדינטות', 'שנה'], axis=1)
    # Fill missing height / settlement religion with the string "0".
    # (Kept as strings on purpose: to_int() later runs every column through
    # pd.to_numeric, so "0" and 0 end up identical.)  The assignment form
    # replaces the deprecated chained df[col].fillna(..., inplace=True).
    df["גובה"] = df["גובה"].fillna("0")
    df["דת יישוב"] = df["דת יישוב"].fillna("0")
    # Population: fill missing rows with the rounded mean of all others.
    pop = 'סך הכל אוכלוסייה 2018'
    df[pop] = df[pop].fillna(np.round(df[pop].mean()))
    # Jews / Arabs counts: where missing, assume 80% of the population is
    # Jewish and 20% Arab.  Vectorized masks replace the original
    # row-by-row iterrows loop (same rounding, much faster).
    arabs_missing = df["ערבים"].isna()
    df.loc[arabs_missing, "ערבים"] = (df.loc[arabs_missing, pop] * 0.2).round()
    jews_missing = df["מזה: יהודים"].isna()
    df.loc[jews_missing, "מזה: יהודים"] = (df.loc[jews_missing, pop] * 0.8).round()
    # Organizational affiliation: per the index sheet, missing means "19".
    df["השתייכות ארגונית"] = df["השתייכות ארגונית"].fillna("19")
    # Planning commission: missing -> "0".
    df["ועדת תכנון"] = df["ועדת תכנון"].fillna("0")
    # Locality type: bucket the detailed codes into
    # 100 = urban, 200 = rural, 300 = other ("place", also catches NaN).
    urban = [120, 130, 140, 150, 250, 160, 260, 170, 270, 180, 280, 190, 191, 192, 193, 290]
    rural = [310, 320, 330, 340, 440, 370, 350, 450, 460]
    shape = "צורת יישוב שוטפת"
    df[shape] = np.select(
        [df[shape].isin(urban), df[shape].isin(rural)],
        [100, 200],
        default=300)
    # Municipal status: codes 1..77 are regional councils -> collapse to 2;
    # anything still missing gets "3" (no status).
    status = "מעמד מונציפאלי"
    df.loc[df[status].isin(range(1, 78)), status] = 2
    df[status] = df[status].fillna("3")
    return df
def to_int(df):
    '''
    Coerce every column of *df* to a numeric dtype, in place.

    Numeric-looking strings such as "0" become real numbers via
    pd.to_numeric (non-convertible values raise, its default mode).
    The mutated frame is also returned for call-site convenience.
    '''
    for name in list(df.columns):
        df[name] = pd.to_numeric(df[name])
    return df
def translate_bycode(df):
    '''
    Return *df* with the Hebrew bycode column names replaced by their
    English equivalents.  Columns absent from the mapping are left
    untouched (standard pandas rename semantics).
    '''
    hebrew_to_english = {
        'סמל ועדה': 'Committee symbol',
        'שם יישוב': 'Settlemnt',
        'סמל יישוב': 'SettlementID',
        'סך הכל אוכלוסייה 2018': 'Population 2018',
        'מזה: יהודים': 'Jews',
        'ערבים': 'Arabs',
        'כשרים': 'Qualified',
        'פסולים': 'Disqualified',
        'צורת יישוב שוטפת': 'Locality type',
        'דת יישוב': 'Religion',
        'ועדת תכנון': 'Planning Commission',
        'מצביעים': 'Voters',
        'מרחב משטרה': 'Police district',
        'גובה': 'Height',
        'נפה': 'subdistrict',
        'מחוז': 'District',
        'אזור טבעי': 'Natural area',
        'תעתיק': 'Translated',
        'מעמד מונציפאלי': 'Municipal status',
        'שיוך מטרופוליני': 'Metropolitan affiliation',
        'שנת ייסוד': 'Founding year',
        'השתייכות ארגונית': 'Organizational affiliation',
        'שנה': 'Year',
        'שם יישוב באנגלית': 'Settlement Eng',
        'אשכול רשויות מקומיות': 'Cluster Local Authorities',
    }
    return df.rename(columns=hebrew_to_english)
def translate_top_parties(df):
    '''
    Return the votes dataframe with the Hebrew ballot letters of the top
    parties renamed to English party names; other columns are untouched.
    '''
    party_names = {
        'מחל': 'Likude',
        'פה': 'Blue-white',
        'ודעם': 'Joint-list',
        'שס': 'Shas',
        'ג': 'Degel hatora',
        'אמת': 'Ha Avoda',
        'ל': 'Israel beteno',
        'טב': 'New yameen',
        'נץ': 'Natz',
        'ז': 'Zehoot',
        'נ': 'N',
        'ףז': 'Internet pirates',
        'קץ': 'Kets',
    }
    return df.rename(columns=party_names)
def get_english_settlements(df):
    '''
    Swap the Hebrew settlement-name column for the English one.

    Looks up each row's settlement code ('סמל ישוב') in the module-level
    `bycode` table, attaches 'שם יישוב באנגלית' through a left join, drops
    the Hebrew 'שם ישוב' column and renames the English column to
    'settlement'.
    '''
    lookup = bycode[['סמל יישוב', 'שם יישוב באנגלית']].set_index(['סמל יישוב'], verify_integrity=True)
    joined = df.join(lookup, on=['סמל ישוב'], how='left')
    joined = joined.drop('שם ישוב', axis=1)
    return joined.rename(columns={'שם יישוב באנגלית': 'settlement'})
def drop_parties(df, threshold):
    '''
    Drop every party column whose nationwide vote total is below *threshold*.

    The first seven columns are metadata (symbols, totals, ...), so only
    columns from index 7 onward are treated as parties.  Returns a new
    dataframe; the input is not modified.
    '''
    weak = [party for party in df.columns[7:] if df[party].sum() < threshold]
    return df.drop(weak, axis=1)
def name_districts(df):
    '''
    Replace numeric district codes in df['district'] with Hebrew names.

    Codes 1-7 are mapped to the official district names; any other value
    is left untouched.  The frame is modified in place and also returned.
    Like the original if/elif chain, every value must be convertible with
    int() or a ValueError propagates.
    '''
    names = {
        1: 'מחוז ירושלים',
        2: 'מחוז הצפון',
        3: 'מחוז חיפה',
        4: 'מחוז המרכז',
        5: 'מחוז תל אביב',
        6: 'מחוז הדרום',
        7: 'אזור יהודה ושומרון',
    }
    # A lookup table replaces the original 7-branch if/elif chain over
    # iterrows; unmapped codes fall back to their original value.
    df['district'] = df['district'].map(lambda code: names.get(int(code), code))
    return df
# Check if the information we have is full or missing values:
# count NaNs per bycode column, skipping the two identifier columns.
# (The original accumulator was named 'l', which PEP 8 discourages because
# it is easily confused with '1'/'I'; the loop is now a comprehension.)
null_counts = [[col, bycode[col].isnull().sum()] for col in bycode.columns[2:]]
number_of_nulls = pd.DataFrame(null_counts, columns = ["Column", "Number of NaN values"])
del null_counts
number_of_nulls
As we can see above, some features like שיוך מטרופוליני and אשכול רשויות מקומיות have half or more of their values as NaN (undefined), which means we can't count on them to help our learning methods. That's why, in each section, when the features are chosen these ones never make the cut.
Note: another feature with a high NaN count is השתייכות ארגונית, but from the index we know missing values there should be '19', and that is what we do in the process_bycode function.
# Call the process_bycode function, which removes some columns and cleans others
cleaned = process_bycode(bycode)
# Drop the Hebrew and English name columns -- the models only need numeric features.
features = cleaned.drop(['שם יישוב','שם יישוב באנגלית'], axis=1)
# to make join easier rename this one
# (the election csv's spell the settlement-code column without the second yod)
features=features.rename(columns={'סמל יישוב': 'סמל ישוב'})
After cleaning and removing irrelevant columns from the bycode df we save the results in 'features' which includes:
'''
Get last election votes and translate settlement and party names
'''
# Work on a copy of the 1st-election votes with translated names.
c21_viz = c21.copy()
c21_viz = get_english_settlements(c21_viz)
c21_viz = translate_top_parties(c21_viz)
# Plot "The biggest Israeli settlements"
# NOTE(review): sum(axis=1) adds *every* numeric column per row, including
# the symbol/total columns -- confirm the ranking is meant to use the raw
# row sum rather than just the party-vote columns.
settl_top = c21_viz.sum(axis=1).reset_index()
settl_top.columns = ['settlement', 'values']
settl_top = settl_top.sort_values('values', ascending=False)
settl_top['percent'] = round(settl_top['values'] / settl_top['values'].sum() * 100, 2)
# Assignment aligns on the preserved index, so each row keeps its own name
# even after the sort above.
settl_top['settlement'] = c21_viz['settlement']
fig = plt.figure(figsize=(15, 6))
ax = fig.add_subplot(111)
sns.barplot(y='settlement', x='values', data=settl_top.head(15), palette=sns.color_palette("Set2", 15))
plt.title('The biggest settlements in Israel (top-15)')
plt.ylabel('Settlements')
# NOTE(review): the x values plotted are raw sums, but the label says '%'.
plt.xlabel('% of all votes')
plt.show()
# Top-10 parties
# Keep only the party columns: drop the committee/settlement symbols and
# the vote-total columns before summing votes per party.
party_top = c21_viz.drop(['סמל ועדה','סמל ישוב','בזב','מצביעים','פסולים','כשרים'], axis=1)
party_top = party_top.sum().reset_index()
party_top.columns = ['party', 'values']
party_top = party_top.sort_values('values', ascending=False)
party_top['percent'] = round(party_top['values']/party_top['values'].sum()*100, 2)
# Plot "The most popular Israeli parties in the first election"
fig = plt.figure(figsize=(11, 5))
ax = fig.add_subplot(111)
sns.barplot(y='party', x='percent', data=party_top.head(10), palette=sns.color_palette("Set2"))
# Fixed word order in the title (was: "... parties in first the election").
plt.title('The top 10 most popular Israeli parties in the first election')
plt.ylabel('Parties')
plt.xlabel('% of all votes')
plt.show()
Since after the 8th party, the number of votes per party gets lower and lower we assume these small parties do not affect our analysis and we'll be removing them before starting to run our models.
First get all features in one dataframe, we already have 'features' which has all features from bycode, join it with ['סמל ישוב', 'מצביעים', 'פסולים', 'כשרים'] from c21 remove unwanted features like name and year then build correlation matrix and heatmap.
''' DATA '''
# Keep only the settlement code + the three vote-count columns, then attach
# the cleaned bycode features per settlement (left merge on the code).
c21_viz = c21_viz[['סמל ישוב', 'מצביעים', 'פסולים', 'כשרים']]
c21_viz = pd.merge(c21_viz, features, how='left', left_on=['סמל ישוב'], right_on = ['סמל ישוב'])
# The code and police-district columns are identifiers, not features.
c21_viz = c21_viz.drop(['סמל ישוב','מרחב משטרה'],axis=1)
c21_viz = translate_bycode(c21_viz)
c21_viz = to_int(c21_viz)
''' Correlation Matrix '''
# Pairwise scatter matrix of a hand-picked subset of features, coloured by
# the number of voters.
fig = px.scatter_matrix(c21_viz, color="Voters", title="Scatter matrix of features",width=950,
                        height=1000, dimensions=['Voters', 'Disqualified', 'Qualified','Natural area', 'District', 'Religion',
                                                 'Municipal status'])
fig.show()
The only visible correlation here is that the numbers of disqualified and qualified votes get higher as the number of voters grows, which is an expected correlation.
The religion feature shows higher vote and qualified-vote counts for religion 1 (Jews), but that is not informative since most of Israel's population is Jewish. The same goes for מעמד מוניציפלי: we can clearly see higher vote counts in status = 0, which is עירייה, but we can't infer a correlation since it also has a larger population.
''' Correlation Heatmap '''
# Pearson correlation between every pair of the (now numeric) features,
# rendered as an annotated heatmap.
corr = c21_viz.corr()
plt.figure(figsize=(18, 18))
sns.heatmap(corr, vmax=.8, linewidths=0.01, square=True, annot=True, linecolor="white")
plt.title('Correlation between features');
plt.show()
# delete variables
# (free the notebook namespace -- later sections rebuild what they need from c21)
del c21_viz
del corr
The research question we'll be trying to answer in this section is: is Israel's district distribution correlated with which parties each settlement votes for? In other words, can we expect Nazareth's votes to go to certain parties just because it is in Israel's Northern District?
To do that we'll fit the data of pure votes using two clustering methods, after we have K clusters we'll view their districts and infer if we indeed found a correlation or not.
We'll be working on data from first elections, so we'll copy that df and process the votes by:
# Get first election's data
clus_c21 = c21.copy()
# Drop parties with a small number of voters (under 1000 nationwide).
relevant_parties = drop_parties(clus_c21, 1000)
# Translate settlement names hebrew -> english
relevant_parties = get_english_settlements(relevant_parties)
clus_data = relevant_parties.set_index('settlement')
# Remove non-party columns so only the per-party vote counts remain.
clus_data = clus_data.drop(['כשרים', 'פסולים', 'מצביעים', 'בזב', 'סמל ישוב', 'סמל ועדה'], axis=1)
clus_data = to_int(clus_data)
# Normalize: each row becomes the fraction of that settlement's votes per
# party, so every row sums to 1 regardless of settlement size.
relative_votes = clus_data.div(clus_data.sum(axis=1), axis=0)
# The string below is the manual (loop-based) equivalent of the div above,
# kept by the authors for reference.
''' for i,r in df_original.iterrows():
for c in columns:
x = r[c]/r['valid votes']
df.at[i , c]=x'''
# Project to (x,y) points with PCA to help plotting.
xy = sklearnPCA(n_components=2).fit_transform(relative_votes)
# A 3-component projection as well, for the 3D scatter plot.
xyz = sklearnPCA(n_components=3).fit_transform(relative_votes)
# Save the projections in dataframes.
threeD = pd.DataFrame(xyz)
pca_df = pd.DataFrame(xy)
# Save names and sizes separately; needed for graphs after the clustering.
locations = relevant_parties[['settlement', 'סמל ישוב']]
sizes = relevant_parties[['סמל ישוב','מצביעים']]
sizes = sizes.rename(columns={'מצביעים': 'size'})
# Attach name/size to the (x,y) df to help with the visualizations later.
pca_df['locations'] = locations['settlement']
pca_df['sizes'] = sizes['size']
relative_votes = translate_top_parties(relative_votes)
relative_votes.head(2)
The data we've built for the clustering method is in the 'relative_votes' df, which contains normalized votes for the following parties:
In addition, 'pca_df' holds a tuple (x, y) — x saved under column '0' and y under '1' — representing each settlement's vote percentages projected to 2D. This dataframe has two more columns: one for the settlement's name and the other for its total number of voters.
After the clustering is done we'll need each settlement's name and number of voters to build a few visualizations, so we add them to our vote tuples.
# Plot the data on a graph to see the nodes before and after clustering:
# left panel = 2D PCA projection, right panel = 3D PCA projection.
fig = plotly.subplots.make_subplots(rows=1, cols=2, horizontal_spacing=0.03, specs=[[{"type": "xy"},{"type": "scatter3d"}]])
fig.add_trace(
    go.Scatter( x=pca_df[0], y=pca_df[1],marker_symbol='hexagon2', mode="markers+text",
                marker=dict(size=12,color='rgba(135, 206, 250, 0.7)', line=dict(width=1, color='DarkSlateGrey'))),
    row=1, col=1)
fig.add_trace(
    go.Scatter3d(x=threeD[0], y=threeD[1], z=threeD[2],
                 mode="markers",marker_symbol='circle-open', marker=dict(size=7)),
    row=1, col=2)
fig.update_xaxes(title_text="x", row=1, col=1)
fig.update_yaxes(title_text="y", row=1, col=1)
fig.update_layout(height=600, width=1000,
                  title_text="Plotting data after lowering the it's dimention, once in 2D the other 3D", showlegend=False)
fig.show()
Looking at the scatter plots above we can already think of how the data will be clustered, we'll draw scatter plots after the clustering is done to see if we get what we expected.
# Elbow method: fit k-means for k = 1..9 and record the mean distance of
# each point to its nearest centroid ("distortion"); the bend in the curve
# suggests the number of clusters to use.
distortions = []
K = range(1, 10)
for k in K:
    kmeanModel = KMeans(n_clusters=k, max_iter=10000).fit(relative_votes)
    #kmeanModel.fit(only_parties)
    distortions.append(sum(np.min(cdist(relative_votes, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / relative_votes.shape[0])
plt.figure(figsize=(11, 5))
plt.plot(K, distortions, 'o-', markersize=12)
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.title('Optimal number of clusters')
plt.show()
# Fit the final k-means with the K chosen from the elbow plot.
K=4;
kmeanModel = KMeans(n_clusters=K, max_iter=10000).fit(relative_votes)
print("The parties in each cluster: ")
# For each cluster, pair centre coordinates with party names and keep the
# 4 parties with the highest centre weight; the formatted string is saved
# in kmeans_cluster_info for the chart legends.
kmeans_cluster_info = [1]*K
cluster=[1]*K
for k in range(0,K):
    cluster[k]=tuple(zip(kmeanModel.cluster_centers_[k],relative_votes.columns))
    cluster[k]=sorted(cluster[k],reverse=True)
    info = 'Cluster' + str(k+1) + ':'
    print ('Cluster '+str(k+1)+':', end=' ')
    for i in range(0,4):
        # round(100*x)/100 truncates the centre weight to 2 decimals.
        info = info + cluster[k][i][1] + "-" + str(round(100*cluster[k][i][0])/100) + ', '
        print(cluster[k][i][1]+" - "+str(round(100*cluster[k][i][0])/100), end=', ')
    kmeans_cluster_info[k] = info
    print()
First we'll create a df 'kmeans_graph' which includes all needed data to build relevant charts:
# Get districts and change them from numbers to names
districts = features[['סמל ישוב', 'מחוז' ]]
districts = districts.rename(columns={'מחוז': 'district'})
districts.district = districts.district.astype(str)
districts = name_districts(districts)
# Get pca_df and add new features.
# NOTE(review): this is an alias, not a copy -- the columns added below are
# also added to pca_df itself; confirm that is intended.
kmans_graph = pca_df
# Add predicted labels
kmans_graph['label'] = kmeanModel.predict(relative_votes)
# Add district (assignment aligns by row index)
kmans_graph['dis'] = districts['district']
print('Example')
kmans_graph.head(1)
def scatter_clusters(df, t, clusterinfo):
    '''
    Build a 3D scatter figure of a clustering result.

    x/y are the 2D PCA coordinates (columns 0 and 1), z is the settlement's
    district name ('dis'); one trace per cluster label so each cluster gets
    its own colour and legend entry.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain columns 0, 1, 'label', 'dis' and 'locations'.
    t : str
        Cluster-method name for the title ("Kmeans" or "GMM").
    clusterinfo : list of str
        One legend string per cluster, indexed by sorted label order.

    Returns
    -------
    go.Figure
    '''
    # Fixes vs. original: the doc text was a bare module-level string before
    # the def (now a real docstring); a stray trailing comma turned the
    # append statement into a 1-tuple; the manual counter is now enumerate.
    traces = []
    for i, label in enumerate(sorted(df.label.unique())):
        members = df[df.label == label]
        traces.append(
            go.Scatter3d(x=members[0], y=members[1], z=members['dis'],
                         text=members['locations'], mode='markers',
                         hoverinfo='text', name=clusterinfo[i], opacity=0.8))
    layout = go.Layout(
        width=900, height=700, template='simple_white', title=t + ' Resulting Clusters',
        hovermode='closest', xaxis=dict(title='x'), yaxis=dict(title='y'), legend_orientation="h")
    return go.Figure(data=traces, layout=layout)
Explanation: the x and y axes of the scatter plot below represent the settlements after we lowered their dimension to 2D; the color represents the cluster number from fitting the data with the k-means method; and the z axis represents the settlement's district (this feature was not part of the data — we present it this way to see whether there is a correlation between votes and districts).
scatter_clusters(kmans_graph, 'Kmeans', kmeans_cluster_info)
The scatter plot below is similar to the one we drew before fitting the data with k-means; in addition, here the dot's size refers to the number of voters in that settlement and its color to the cluster we predict it to be in.
def scatter_with_size(df, t, clusterinfo):
    '''
    Build a 2D scatter figure of the clusters, with each marker's size
    scaled by the number of voters in that settlement.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain columns 0, 1, 'label', 'locations' and 'sizes'.
    t : str
        Cluster-method name for the title ("Kmeans" or "GMM").
    clusterinfo : list of str
        One legend string per cluster, indexed by sorted label order.

    Returns
    -------
    go.Figure
    '''
    # Fixes vs. original: the doc text was a bare module-level string before
    # the def (now a real docstring); the manual counter is now enumerate.
    traces = []
    for i, label in enumerate(sorted(df.label.unique())):
        members = df[df.label == label]
        traces.append(go.Scatter(
            x=members[0], y=members[1], text=members['locations'],
            mode='markers', hoverinfo='text', name=clusterinfo[i],
            # sqrt keeps the huge cities from dwarfing everything; /5 is a
            # purely visual scale factor.
            marker=dict(size=[np.sqrt(a)/5 for a in (members['sizes'])], )))
    layout = go.Layout(
        width=970,
        height=750,
        template='seaborn',
        title=t + ' Resulting Clusters with Number of Votes as Size',
        hovermode='closest',
        xaxis=dict(
            title='x', autorange=True, showgrid=True, zeroline=False,
            showline=False, ticks='', showticklabels=True),
        yaxis=dict(title='y', autorange=True, showgrid=True, zeroline=False,
                   showline=False, ticks='', showticklabels=True),
        legend_orientation="h")
    return go.Figure(data=traces, layout=layout)
# Bubble chart of the k-means result: dot size ~ sqrt(number of voters).
fig = scatter_with_size(kmans_graph, 'Kmeans', kmeans_cluster_info)
py.iplot(fig)
To do that we added a new csv from an election 4 years ago which has the latitude and longitude of 98% of the settlements we have; we read that csv and save it in 'bycode_new'. Now we build a new dataframe 'map_df' which contains:
# Get new bycode and keep only relevant data -> settlementCode, X_GEO, Y_GEO
bycode_new = pd.read_csv(r'data/coord.csv', encoding='utf-8')
coord = bycode_new[['SettlementCode', 'X_GEO', 'Y_GEO']].copy()
# Join our bycode with the coordinates
joined = pd.merge(bycode, coord, how='left', left_on=['סמל יישוב'], right_on = ['SettlementCode'])
joined = pd.merge(joined, sizes, how='right', left_on=['סמל יישוב'], right_on=['סמל ישוב'],)
# Remove rows that have no coordinates
joined = joined[joined['X_GEO'].notna()]
joined = joined[joined['Y_GEO'].notna()]
map_df = kmans_graph.copy()
# NOTE(review): the assignments below align joined's rows to map_df by
# integer index, not by settlement code -- this relies on both frames
# sharing the same row order; confirm.
map_df['lat'] = joined.Y_GEO
map_df['lng'] = joined.X_GEO
map_df['locations'] = joined['שם יישוב באנגלית']
'''
there are 53 settlements that dont have name in english, and displaying name in hebrew does not work
(missing UTF-8) so we replaced NaNs in location with Undefined.
'''
map_df['locations'] = map_df['locations'].replace(np.nan, 'Undefined', regex=True)
map_df['sizes'] = joined['size']
map_df = map_df[map_df['lat'].notna()]
map_df = map_df[map_df['lng'].notna()]
# Columns 0 and 1 are the PCA coordinates -- not needed on the map.
map_df = map_df.drop([0,1], axis=1)
print('An example :')
map_df.head(1)
# Create Legend for the map
'''
NOTE:
The legend includes % of voted to partied which was hardcoded into the map since making it manual didnt work...
But with it it's easier to understand the clusters.
The parties in each cluster:
'''
# Jinja2 macro injected into the folium map as a fixed HTML legend box.
# The four hex colours here must stay in sync with the `colors` list used
# when drawing the circles, and the cluster text mirrors the k-means
# cluster summaries printed earlier (hard-coded by the authors).
template = """
{% macro html(this, kwargs) %}
<!doctype html> <html lang="en"> <head> <meta charset="utf-8"> </head>
<body>
<div id='maplegend' style='position: absolute; z-index:9999; border:2px solid grey; background-color:rgba(255, 255, 255, 0.8);
border-radius:6px; padding: 10px; font-size:14px; right: 20px; bottom: 20px;'>
<div style='margin-bottom: 5px; font-weight: bold;'>Legend</div>
<div> <ul style='padding: 0; float: left; list-style: none;'>
<li><span style='background:#0173b2;opacity:0.7; float: left; height: 16px; width: 30px; margin-right: 5px;'>
</span>Cluster1: Blue-white-0.58, HaAvoda-0.19, Likude-0.15, Newyameen-0.03</li>
<li><span style='background:#cc78bc;opacity:0.7; float: left; height: 16px; width: 30px; margin-right: 5px;'>
</span>Cluster2: Likude-0.5, Blue-white-0.18, Shas-0.11, Newyameen-0.06</li>
<li><span style='background:#029e73;opacity:0.7; float: left; height: 16px; width: 30px; margin-right: 5px;'>
</span>Cluster3: Joint-list-0.92, Blue-white-0.03, Ha Avoda-0.03, Likude-0.01</li>
<li><span style='background:#d55e00;opacity:0.7; float: left; height: 16px; width: 30px; margin-right: 5px;'>
</span>Cluster4: Newyameen-0.55, Likude-0.26, Blue-white-0.06, DegelHatora-0.05</li>
</ul> </div> </div> </body> </html> {% endmacro %}"""
macro = MacroElement()
macro._template = Template(template)
# Create map, centred roughly on the Tel-Aviv area; "CartoDB Positron" is a
# light tile set that keeps the coloured circles readable.
m = folium.Map(location=[32.13,34.8],zoom_start=9, tiles="CartoDB Positron")
# Colour palette -- must stay in sync with the hard-coded legend HTML above.
colors = ['#0173b2', '#cc78bc', '#029e73', '#d55e00']
# One circle per settlement: position from the coordinates csv, radius
# proportional to the number of voters, colour by predicted cluster.
# (Tuple unpacking "for _, row in iterrows()" replaces the original
# row[1] indexing; column access uses row['...'] to avoid clashing with
# Series attributes.)
for _, row in map_df.iterrows():
    folium.Circle(
        location=(row['lat'], row['lng']),
        radius=0.04 * row['sizes'],
        popup='<b>' + str(row['locations']),
        color=colors[row['label']],
        fill=True,
        fill_color=colors[row['label']],
    ).add_to(m)
# Add legend
m.get_root().add_child(macro)
# Show map
m
The data needed for GMM clustering is the same data we prepared in the previous part, stored in 'relative_votes'. Now we need to find the best number of clusters to use; we'll do that using the BIC score (an approach we found online), where the chosen number of clusters is the one after which the score becomes almost constant.
# Search for the best GMM hyper-parameters: try every covariance type and
# 2..9 components, recording the BIC score of each fit
# (lower BIC = better fit/complexity trade-off).
# Fixes vs. original: 'evluate_bic' renamed (typo, used only here); the
# per-fit `labels = gmm.predict(...)` was removed -- it was overwritten
# every iteration, never used, and cost an extra predict per fit.
evaluate_bic = {}
n_components_range = range(2, 10)
cv_types = ['spherical', 'tied', 'diag', 'full']
# checking what are the best parameters for GMM
for cv_type in cv_types:
    bic = []
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = mixture.GaussianMixture(n_components=n_components, covariance_type=cv_type)
        gmm.fit(relative_votes)
        bic.append(gmm.bic(relative_votes))
    evaluate_bic[cv_type] = bic
# Plot the results on a grouped bar chart, one group per cluster count.
n_components_range = [2, 3, 4, 5, 6, 7, 8, 9]
fig = go.Figure()
for cv_type, scores in evaluate_bic.items():
    fig.add_trace(go.Bar(x=n_components_range, y=scores, name=cv_type))
# NOTE(review): the y-axis range [0, -120000] is inverted so that
# larger-magnitude negative scores point upward -- confirm intended.
layout = go.Layout(barmode='group', template='seaborn', title='Gradient of BIC Scores',
                   xaxis=dict(title='Number of clusters'), yaxis=dict(title='Score', range=[0, -120000]))
fig.update_layout(layout)
fig.show()
#The choosen number of clusters is 6
# NOTE(review): the comment above says 6 while the code uses K = 5 -- one of
# the two is stale; confirm against the BIC chart which K was intended.
K = 5
gmmModel = GaussianMixture(n_components=K).fit(relative_votes)
labels = gmmModel.predict(relative_votes)
First we'll create a df 'gmm_graph' which includes all needed data to build relevant charts:
# Same chart dataframe as kmans_graph, but for the GMM labels:
# columns 0/1 = 2D PCA coordinates, plus label, name, size and district.
gmm_graph = pd.DataFrame(xy)
gmm_graph['label'] = labels
gmm_graph['locations'] = locations['settlement']
gmm_graph['sizes'] = sizes['size']
gmm_graph['dis'] = districts['district']
same explanation as the one in kmeans.
# Generic legend labels for the GMM clusters (no per-party breakdown here).
gmm_info = [1]*K
for i in range(0,K):
    gmm_info[i] = 'Cluster ' + str(i+1)
# 3D scatter: x/y = PCA coordinates, z = district, colour = GMM label.
scatter_clusters(gmm_graph, 'GMM', gmm_info)
Same scatter plot as in kmeans.
# Same bubble chart as for k-means: size ~ number of voters, colour = GMM label.
fig = scatter_with_size(gmm_graph, 'GMM', gmm_info)
py.iplot(fig)
We can see a certain change in some districts — nothing too major. For example, מחוז תל אביב has votes for both right and left parties, but not the far right (it doesn't vote for cluster 3, which is mainly ג and שס), and in אזור יהודה we can clearly see the lack of votes for left parties. Other districts contain both Jewish and Arab settlements, which evens out the clusters in them, varying with the ratio of Arab to Jewish settlements.
All in all, other than differences between mainly-Jewish districts and mixed districts, both clustering methods didn't find any deeper correlation.
We can compare the resulted clusters using measures provided by sklearn:
# Compare the two clusterings with sklearn's internal cluster metrics.
# NOTE(review): all three scores are computed on the 2D PCA projection (xy)
# rather than the full vote matrix -- confirm that is intended.
kmeanslabels = kmeanModel.labels_
gmmlabels = gmmModel.predict(relative_votes)
#the score is -1 for incorrect clustering and +1 for highly dense clustering, around 0 indicate overlapping clusters
print("kmeans silhouette", round(metrics.silhouette_score(xy, kmeanslabels, metric='euclidean'),5))
print("gmm silhouette", round(metrics.silhouette_score(xy, gmmlabels, metric='euclidean'),5))
# higher score -> model with better defined clusters
print("\nkmeans calinski harabasz", round(metrics.calinski_harabasz_score(xy, kmeanslabels),5))
print("gmm calinski harabasz", round(metrics.calinski_harabasz_score(xy, gmmlabels),5))
# Values closer to zero indicate a better partition.
print("\nkmeans davies bouldin", round(metrics.davies_bouldin_score(xy, kmeanslabels),5))
print("gmm davies bouldin", round(metrics.davies_bouldin_score(xy, gmmlabels),5))
In all of the measures it looks like the k-means clusters are "better" than the GMM's clusters, but that may only be due to the GMM's oddly shaped clusters and not because it did a worse job.
The research question in this section is: what is the number of eligible votes in the last elections in Haifa, Ayelet Hashachar, Eilat, Sakhnin and Katzrin? To answer this question our model was trained on settlement features and the number of eligible votes in the previous elections, and the result would be the number of eligible votes in the "current" elections (once for the 2nd elections and once for the 3rd).
The settlement features used are detailed below; after the training is done we checked which features are the most important, hoping they'll give us an insight or a correlation.
Copy the df of the 3 elections and the bycode table. rearrange the data to train, test and eval.
# Copy the raw frames so this section cannot mutate earlier sections' data.
rf_c21 = c21.copy()
rf_c22 = c22.copy()
rf_c23 = c23.copy()
rf_bycode = bycode.copy()
# Use the method we created earlier to fill NaN values with mean or zeros
rf_bycode = process_bycode(rf_bycode)
# The 5 settlements whose valid-vote counts we will predict.
rows_to_predict = ['חיפה', 'איילת השחר', 'אילת', "סחנין", 'קצרין']
# ID's of settlements for predictions (settlement codes of the names above)
keep = [77, 2600, 4000, 7500, 4100]
# Features to drop from the model inputs (identifiers + the target column).
to_drop = ['סמל ישוב', 'שם יישוב', 'שם יישוב באנגלית',
           'valid_third', 'מעמד מונציפאלי']
# Pull the per-settlement valid-vote count ('כשרים') from each election and
# rename per round so all three can be merged side by side.
first_ele = rf_c21[['סמל ישוב', 'כשרים']]
first_ele = first_ele.rename(columns={'כשרים': 'valid_first'})
second_ele = rf_c22[['סמל ישוב', 'כשרים']]
second_ele = second_ele.rename(columns={'כשרים': 'valid_second'})
third_ele = rf_c23[['סמל ישוב', 'כשרים']]
third_ele = third_ele.rename(columns={'כשרים': 'valid_third'})
valid_votes = pd.merge(first_ele, second_ele, on = ['סמל ישוב'])
valid_votes = pd.merge(valid_votes, third_ele, on = ['סמל ישוב'])
# Align the bycode key spelling with the election csv's before merging.
rf_bycode = rf_bycode.rename(columns={'סמל יישוב': 'סמל ישוב'})
all_data = pd.merge(valid_votes, rf_bycode, on = ['סמל ישוב'])
all_data = all_data.fillna(0)
''' Test Data '''
# The 5 held-out settlements: target = 3rd-election valid votes; features =
# bycode features plus the 2nd-election count renamed 'valid_prev'.
test = all_data[all_data['סמל ישוב'].isin(keep)]
ytest = test['valid_third']
xtest = test.drop(to_drop, axis=1)
xtest = xtest.drop(['valid_first'], axis=1)
xtest = xtest.rename(columns={'valid_second': 'valid_prev'})
''' Train Data '''
# Part 1: predict 3rd-election votes from 2nd-election votes, excluding
# the 5 test settlements.
train1 = all_data[~all_data['סמל ישוב'].isin(keep)]
ytrain1 = train1[['valid_third']]
ytrain1 = ytrain1.rename(columns={'valid_third': 'actual'})
xtrain1 = train1.drop(to_drop, axis=1)
xtrain1 = xtrain1.drop(['valid_first'], axis=1)
xtrain1 = xtrain1.rename(columns={'valid_second': 'valid_prev'})
# Part 2: predict 2nd-election votes from 1st-election votes, over ALL rows.
# NOTE(review): this part includes the 5 held-out settlements (albeit with
# an earlier election as target) -- confirm that leakage is acceptable.
train2 = all_data.copy()
ytrain2 = train2[['valid_second']]
ytrain2 = ytrain2.rename(columns={'valid_second': 'actual'})
xtrain2 = train2.drop(to_drop, axis=1)
xtrain2 = xtrain2.drop(['valid_second'], axis=1)
xtrain2 = xtrain2.rename(columns={'valid_first': 'valid_prev'})
# Stack the two parts: both share the same schema ('valid_prev' + features).
ytrain = pd.concat([ytrain1, ytrain2])
ytrain = ytrain['actual']
xtrain = pd.concat([xtrain1, xtrain2])
After the cleaning and trimming the dataframes we decided to keep the following features:
The test data are xtest and ytest where, xtest contains the features above ('valid_prev' features contrains the number of legal votes from the 2nd elections) and ytest has the number of legal votes, both for the following settlements from the 3rd election.
The train data are xtrain and ytrain can be devided into two parts:
To ensure finding the best model we create a grid search object and fit the data to all possible combinations.
# Hyper-parameter grid for the random-forest regressor.
param_grid = {
    'criterion': ['mse'],  # NOTE(review): renamed to 'squared_error' in sklearn >= 1.0
    'n_estimators': [150, 180, 200, 220],  # The number of trees in the forest.
    'max_depth': [None, 40, 60, 80],  # The maximum depth of the tree.
    'max_features': ['sqrt', 4, 5],  # number of features to consider when looking for the best split
    # BUG FIX: min_samples_split=None is not a valid value (sklearn requires
    # an int >= 2 or a float) and made GridSearchCV raise during fit -- the
    # library default 2 is used instead.
    'min_samples_split': [2, 3, 4],  # minimum samples required to split an internal node
    'bootstrap': [True]  # Whether bootstrap samples are used when building trees.
}
# Exhaustive 5-fold cross-validated search over every combination,
# parallelised across all cores (n_jobs=-1).
gsRFC = GridSearchCV(RandomForestRegressor(), param_grid, n_jobs=-1, cv=5)
# Fit
gsRFC.fit(xtrain, ytrain)
gsRFC.best_params_
Given the best model run predictions and calculate the MSE of the model.
# Run the best model found by the grid search on the 5 held-out settlements.
new_best_model = gsRFC.best_estimator_
ypred = new_best_model.predict(xtest)
# calculate MSE over the 5 predictions
mse = mean_squared_error(ytest, ypred)
print('The model\'s MSE is ' + str(round(mse, 2)))
First we'll create a new dataframe 'rf_graph' with the following columns:
# Create and get relevant data only: the 5 predicted settlements from the
# 3rd-election file.
rf_graph = rf_c23[rf_c23['שם ישוב'].isin(rows_to_predict)].copy()
# Drop everything except the settlement name and the valid-vote count
# (all per-party columns plus the csv artifact column).
rf_graph = rf_graph.drop(['סמל ועדה', 'סמל ישוב', 'בזב', 'מצביעים', 'פסולים',
                          'אמת', 'ג', 'ודעם', 'ז', 'זך', 'זץ', 'טב', 'י', 'יז', 'ינ', 'יף', 'יק',
                          'יר', 'כ', 'כן', 'ל', 'מחל', 'נ', 'נז', 'ני', 'נץ', 'נק', 'פה', 'ףז',
                          'ץ', 'ק', 'קי', 'קך', 'קץ', 'שס', 'Unnamed: 37'], axis=1)
# NOTE(review): this assumes ypred's row order matches the filtered
# rf_c23 order -- confirm the two sources sort settlements identically.
rf_graph['Pred'] = np.round(ypred)
rf_graph = rf_graph.rename(columns={'שם ישוב': 'Settlement', 'כשרים':'Test'})
# Per-settlement squared error, plus a log scale for the chart below.
# NOTE(review): np.log yields -inf for a perfect prediction (MSE == 0).
rf_graph['MSE'] = (rf_graph['Pred']-rf_graph['Test'])**2
rf_graph['logMSE'] = np.round(np.log(rf_graph['MSE']), 2)
# Display the results in a table: actual number of legal votes, predicted number, MSE and log(MSE)
fig = go.Figure(data=[go.Table(
    header=dict(values=['<b>Settlement</b>', '<b>Actual</b>', '<b>Predicted</b>', '<b>MSE</b>' ,'<b>log(MSE)</b>'],
                line_color='darkslategray', fill_color='rgb(158,202,225)',
                font=dict(size=13), align='center'),
    cells=dict(values=[rf_graph['Settlement'], rf_graph['Test'], rf_graph['Pred'], rf_graph['MSE'] ,rf_graph['logMSE']],
               line_color='darkslategray', fill_color = ['rgb(222,252,229)','white'],
               align = 'center', font = dict(color = 'darkslategray', size = 13) ))
])
fig.update_layout(width=800, height=315, title='Actual number of legal votes vs. predicted number of legal votes with log(MSE)')
fig.show()
Here we display the results in two charts:
# Bar traces: actual vs. predicted votes on top, log(MSE) per settlement
# underneath.
test_trace = go.Bar(name='Test', x=rf_graph['Settlement'], y=rf_graph['Test'], text=rf_graph['Test'], textposition='auto',
            marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)', marker_line_width=1, opacity=0.6)
# FIX: the original colour was 'rgb(180,300,200)' — 300 is outside the
# valid 0-255 RGB range and only rendered because browsers clamp it;
# 255 is the value it was effectively displayed as.
pred_trace = go.Bar(name='Pred', x=rf_graph['Settlement'], y=rf_graph['Pred'], text=rf_graph['Pred'], textposition='auto',
            marker_color='rgb(180,255,200)', marker_line_color='rgb(30,70,70)', marker_line_width=1, opacity=0.6)
error_trace = go.Bar(name='Error', x=rf_graph['Settlement'], y=rf_graph['logMSE'], text=rf_graph['logMSE'], textposition='auto',
            marker_color='rgb(224,127,127)', marker_line_color='rgb(255,255,255)', marker_line_width=1, opacity=0.6)
# Two stacked subplots sharing the x axis (settlement names).
rf_fig = plotly.subplots.make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.02)
rf_fig.append_trace(test_trace, 1, 1)
rf_fig.append_trace(pred_trace, 1, 1)
rf_fig.append_trace(error_trace, 2, 1)
# Axis labels per subplot.
rf_fig.update_xaxes(title_text="Settlement", row=2, col=1)
rf_fig.update_yaxes(title_text="Votes", row=1, col=1)
rf_fig.update_yaxes(title_text="log(MSE)", row=2, col=1)
rf_fig['layout'].update(height=600, width=950, title={'text': "Final Results Comparison", 'y': 0.9, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'})
rf_fig.show()
# Free the dataframes and splits used by the random-forest section;
# they are no longer needed and only consume memory.
del rf_c21, rf_c22, rf_c23, rf_bycode, rf_graph
del xtrain, ytrain, xtest, ytest
Our goal in this section is to predict the number of voters for parties: Likud, the joint list and blue-white in the following settlements: Jerusalem, Bnei Brak, Sakhnin, Carmiel, and Dalit al-Carmel.
# Work on private copies so the raw election tables remain untouched.
ada_c21, ada_c22, ada_c23 = c21.copy(), c22.copy(), c23.copy()
ada_bycode = bycode.copy()
# Target settlements. Two spelling variants are kept — presumably because
# the election-results file and the bycode file write some names
# differently (apostrophes / hyphens); TODO confirm against the raw files.
rows_to_predict = ['ירושלים', 'בני ברק', "סחנין", 'כרמיאל', 'דאלית אלכרמל']
row_to_test = ['ירושלים', 'בני ברק', "סח'נין", 'כרמיאל', 'דאלית אל-כרמל']
# Identifier columns that must not be used as features.
cols_to_drop = ['שם יישוב', 'סמל ישוב', 'שם יישוב באנגלית']
"""Data Cleansing"""
# Fill NaN's and normalise the bycode feature table.
ada_bycode = process_bycode(ada_bycode)
# Stack the first two elections, then keep the id and the three party columns.
frames = [ada_c21, ada_c22]
elections12 = pd.concat(frames, sort=False)
elections12 = elections12[['סמל ישוב', 'ודעם', 'מחל', 'פה']]
# Inner-join the votes with the settlement features and zero-fill the gaps.
df4 = elections12.join(ada_bycode.set_index(['סמל יישוב'], verify_integrity=True),
                       on=['סמל ישוב'], how='inner')
df4 = df4.fillna(0)
"""test data"""
x_test = df4[df4['שם יישוב'].isin(row_to_test)]
x_test = x_test.drop(cols_to_drop + ['ודעם', 'מחל', 'פה'] , axis=1)
x_test = x_test.drop_duplicates(subset=None, keep='first', inplace=False)
df4 = df4.drop(cols_to_drop, axis=1)
# df4 = df4.drop(labels.index, axis=0)
"""Train data"""
X = df4.drop(['ודעם', 'מחל', 'פה'], axis=1)
"""get labels, fill NAN with 0, assuming city has 0 votes"""
y1 = df4['פה'].fillna(0)
y2 = df4['ודעם'].fillna(0)
y3 = df4['מחל'].fillna(0)
"""get Test Label From 23st ellection unseen data"""
labels = ada_c23[ada_c23['שם ישוב'].isin(rows_to_predict)]
y_test1 = labels['פה']
y_test2 = labels['ודעם']
y_test3 = labels['מחל']
Since we're building 3 independent models we're going to need 3 of each xtrain, ytrain, xtest, ytest
xtrain
Before training each model we run feature selection, choosing the best features for that model and saving them in X1, X2, X3. The features are chosen from the dataframe 'X' created above, which holds the features of all settlements from the first two elections (these features were explained in previous sections):
ytrain
The training targets are saved in y1, y2, y3, where y1 is the number of votes for the Blue-White party in the first two elections, y2 for the Joint List party, and y3 for Likud.
xtest
The x test data includes the same features listed above, for the following settlements: ירושלים, בני ברק, סחנין, כרמיאל, דאלית אלכרמל. As before, we keep only the relevant features after feature selection and save them in x1_test, x2_test, x3_test, each for a different party.
ytest
The target in test is number of votes in the 3rd election for the settlements listed in xtest, here too we have 3: y_test1, y_test2 and y_test3 each for a different party.
To ensure getting the best results we use a param grid and keep the best model.
# Hyperparameter grid for the AdaBoost regressors: sweep the ensemble
# size and the learning rate; fix the seed for reproducibility.
ada_paramgrid = {
    'n_estimators': [50, 100, 150, 200, 250],
    'learning_rate': [0.001, 0.01, 0.1, 1],
    'random_state': [1],
}
def select_features(xtrain, ytrain, xtest, partyname, estimator=None):
    '''
    Select the important features of xtrain/xtest for one party's model.

    Fits a SelectFromModel meta-transformer on (xtrain, ytrain), keeps only
    the features whose importance exceeds the selector's default threshold,
    and prints the names of the kept features.

    Parameters
    ----------
    xtrain : pandas.DataFrame
        Training features (needs `.columns` for the printed report).
    ytrain : array-like
        Training target (votes for one party).
    xtest : pandas.DataFrame
        Test features, reduced with the same feature mask.
    partyname : str
        Party name used in the printed header.
    estimator : estimator, optional
        Importance source for SelectFromModel; defaults to a fresh
        AdaBoostRegressor(), matching the previous behaviour.

    Returns
    -------
    tuple of ndarray
        The reduced xtrain and the reduced xtest.
    '''
    if estimator is None:
        estimator = AdaBoostRegressor()
    sfm = SelectFromModel(estimator)
    sfm.fit(xtrain, ytrain)
    # Report which features survived the importance threshold.
    print('selected features in ' + partyname + ':')
    for ind in sfm.get_support(indices=True):
        print(xtrain.columns[ind])
    return sfm.transform(xtrain), sfm.transform(xtest)
"""Select features then fit the model, given correspoded labels"""
#Blue-white
X1, x1_test = select_features(X, y1, x_test, 'blue-white')
gs1 = GridSearchCV(AdaBoostRegressor(), param_grid = ada_paramgrid, cv=5, scoring = 'neg_mean_absolute_error', n_jobs= -1)
gs1.fit(X1, y1)
ada1 = gs1.best_estimator_
#joint-list
X2, x2_test = select_features(X, y2, x_test, 'joint-list')
gs2 = GridSearchCV(AdaBoostRegressor(), param_grid = ada_paramgrid, cv=5, scoring = 'neg_mean_absolute_error', n_jobs= -1)
gs2.fit(X2, y2)
ada2 = gs2.best_estimator_
#Likude
X3, x3_test = select_features(X, y3, x_test, 'likude')
gs3 = GridSearchCV(AdaBoostRegressor(), param_grid = ada_paramgrid, cv=5, scoring = 'neg_mean_absolute_error', n_jobs= -1)
gs3.fit(X3, y3)
ada3 = gs3.best_estimator_
Given the 3 models, run predictions on each one and calculate its MSE.
"""Predict Data for Blue White"""
y_pred1 = ada1.predict(x1_test)
mse1 = mean_squared_error(y_test1, y_pred1)
print('MSE of Blue-White model ' + str(round(mse1,2)))
"""Predict Data for Joint List"""
y_pred2 = ada2.predict(x2_test)
mse2 = mean_squared_error(y_test2, y_pred2)
print('MSE of Joint-list model ' + str(round(mse2,2)))
"""Predict Data for Likude"""
y_pred3 = ada3.predict(x3_test)
mse3 = mean_squared_error(y_test3, y_pred3)
print('MSE of Likude model ' + str(round(mse3,2)))
First we create a new dataframe, 'ada_graph', which contains the vote counts for each of the parties.
From these counts we calculate the MSE per settlement. Since the MSE values are large and plotting them directly does not show the differences clearly, we add a log(MSE) column, which is what the graph below uses — it preserves the differences between MSEs but with much smaller numbers. The MSE and log(MSE) are stored in dedicated columns of ada_graph.
Once the data is ready we build charts to present it and also print it in a table.
# Build the results dataframe: keep the settlement name and the three
# parties' third-election vote counts, drop everything else.
ada_graph = labels.drop(['סמל ועדה', 'סמל ישוב', 'בזב', 'מצביעים', 'פסולים', 'כשרים',
       'אמת', 'ג', 'ז', 'זך', 'זץ', 'טב', 'י', 'יז', 'ינ', 'יף', 'יק',
       'יר', 'כ', 'כן', 'ל', 'נ', 'נז', 'ני', 'נץ', 'נק','ףז',
       'ץ', 'ק', 'קי', 'קך', 'קץ', 'שס', 'Unnamed: 37'], axis=1)
ada_graph = ada_graph.rename(columns={'שם ישוב': 'Settlement', 'ודעם':'Joint list test', 'מחל':'Likude test',
                       'פה':'Blue white test'})
# Rounded predictions per party.
ada_graph['Blue white pred'] = np.round(y_pred1)
ada_graph['Joint list pred'] = np.round(y_pred2)
ada_graph['Likude pred'] = np.round(y_pred3)
# Per-settlement squared error and its log. np.log1p is used instead of
# np.log so an exact (rounded) prediction with MSE == 0 maps to 0.0
# instead of -inf with a RuntimeWarning.
ada_graph['JS_mse'] = (ada_graph['Joint list pred']-ada_graph['Joint list test'])**2
ada_graph['JS_logmse'] = np.log1p(ada_graph['JS_mse'])
ada_graph['BW_mse'] = (ada_graph['Blue white pred']-ada_graph['Blue white test'])**2
ada_graph['BW_logmse'] = np.log1p(ada_graph['BW_mse'])
ada_graph['L_mse'] = (ada_graph['Likude pred']-ada_graph['Likude test'])**2
ada_graph['L_logmse'] = np.log1p(ada_graph['L_mse'])
# Round the log(mse) results to only 2 digits after the point.
ada_graph['JS_logmse'] = np.round(ada_graph['JS_logmse'], 2)
ada_graph['L_logmse'] = np.round(ada_graph['L_logmse'], 2)
ada_graph['BW_logmse'] = np.round(ada_graph['BW_logmse'], 2)
Next we create six charts: three showing the actual votes vs. the predicted votes, and the other three showing the log(MSE) of the corresponding chart.
# Build one 'Test'/'Pred'/'Error' trace trio per party; only the first
# trio contributes legend entries so the legend is not repeated 3 times.
# FIX: the predicted-votes colour was 'rgb(180,300,200)' — 300 exceeds the
# valid 0-255 RGB range and only rendered because browsers clamp it;
# 255 (the effective displayed value) is used instead.
def _party_traces(test_col, pred_col, err_col, show_legend):
    # One trio of bar traces (actual, predicted, log-MSE) for a party.
    test = go.Bar(name='Test', x=ada_graph['Settlement'], y=ada_graph[test_col],
                  text=ada_graph[test_col], textposition='auto',
                  marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)',
                  marker_line_width=1, opacity=0.6, showlegend=show_legend)
    pred = go.Bar(name='Pred', x=ada_graph['Settlement'], y=ada_graph[pred_col],
                  text=ada_graph[pred_col], textposition='auto',
                  marker_color='rgb(180,255,200)', marker_line_color='rgb(30,70,70)',
                  marker_line_width=1, opacity=0.6, showlegend=show_legend)
    err = go.Bar(name='Error', x=ada_graph['Settlement'], y=ada_graph[err_col],
                 text=ada_graph[err_col], textposition='auto',
                 marker_color='rgb(224,127,127)', marker_line_color='rgb(255,255,255)',
                 marker_line_width=1, opacity=0.6, showlegend=show_legend)
    return test, pred, err

BW_test, BW_pred, BW_error = _party_traces('Blue white test', 'Blue white pred', 'BW_logmse', True)
JS_test, JS_pred, JS_error = _party_traces('Joint list test', 'Joint list pred', 'JS_logmse', False)
L_test, L_pred, L_error = _party_traces('Likude test', 'Likude pred', 'L_logmse', False)
# 3x2 grid: vote comparison on the left, log(MSE) on the right,
# one row per party. (Subplot-title typos 'Likude results'/'Likud MSe' fixed.)
fig = plotly.subplots.make_subplots(rows=3, cols=2, vertical_spacing=0.09,
          subplot_titles=("Blue-white results", "Blue-white MSE", "Joint-list results",
                          "Joint-list MSE", "Likud results", "Likud MSE"))
for row, trio in enumerate([(BW_test, BW_pred, BW_error),
                            (JS_test, JS_pred, JS_error),
                            (L_test, L_pred, L_error)], start=1):
    fig.append_trace(trio[0], row, 1)
    fig.append_trace(trio[1], row, 1)
    fig.append_trace(trio[2], row, 2)
# Axis titles for every row.
for x in range(1, 4):
    fig.update_xaxes(title_text="Settlement", row=x, col=1)
    fig.update_yaxes(title_text="Votes", row=x, col=1)
    fig.update_xaxes(title_text="Settlement", row=x, col=2)
    fig.update_yaxes(title_text="log(MSE)", row=x, col=2)
fig['layout'].update(height=1300, width=1050,
          title={'text': "Final Results Comparison", 'y': 0.98, 'x': 0.5,
                 'xanchor': 'center', 'yanchor': 'top'},
          legend_orientation="h", legend=dict(x=0.0, y=1.06))
fig.show()
And finally the results in a table
#Create the table
# Column headers and the matching data columns, ordered so each party's
# test / prediction / log(MSE) triplet sits together.
table_headers = ['<b>Settlement</b>',
                 '<b>Joint list TEST</b>', '<b>Joint list PRED</b>', '<b>JS log(MSE)</b>',
                 '<b>Likude TEST</b>', '<b>Likude PRED</b>', '<b>L log(MSE)</b>',
                 '<b>Blue white TEST</b>', '<b>Blue white PRED</b>', '<b>BW log(MSE)</b>']
table_columns = [ada_graph['Settlement'],
                 ada_graph['Joint list test'], ada_graph['Joint list pred'], ada_graph['JS_logmse'],
                 ada_graph['Likude test'], ada_graph['Likude pred'], ada_graph['L_logmse'],
                 ada_graph['Blue white test'], ada_graph['Blue white pred'], ada_graph['BW_logmse']]
fig = go.Figure(data=[go.Table(
    columnorder=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    columnwidth=[55, 45, 45, 45, 45, 45, 45, 45, 45, 45],
    header=dict(values=table_headers, line_color='darkslategray',
                fill_color='rgb(158,202,225)', font=dict(size=12), align='center'),
    cells=dict(values=table_columns, line_color='darkslategray',
               fill_color=['rgb(222,252,229)', 'white'], align='center',
               font=dict(color='darkslategray', size=12)))
])
fig.update_layout(width=990, height=390, title='Actual votes vs. Predicted votes with log(MSE) per Settlement')
fig.show()
Our goal in this section is to answer the following question: in which settlements should the Election Commission place representatives in order to minimize the amount of disqualified votes? We treat this as a classification problem, where a settlement with more disqualified votes than a threshold gets True (in danger of too many disqualified votes) and False means no danger. The threshold used is the smallest disqualified-vote count among the top n% of settlements with the highest counts (n is set in the code below).
# Fresh copies of the raw tables for the disqualified-votes section.
dis_bycode = bycode.copy() #df3
dis_c21 = c21.copy()
dis_c22 = c22.copy()
dis_c23 = c23.copy()
# Identifier / non-feature columns to drop before training.
# FIX: the original listed 'סמל ישוב' twice; one occurrence is enough —
# pandas drops the column once either way.
cols_to_drop = ['שם יישוב', 'סמל ישוב', 'שם יישוב באנגלית', 'שם ישוב', 'נפה', 'בזב', 'כשרים', 'ועדת תכנון']
"""Data cleansing"""
dis_bycode = process_bycode(dis_bycode)
frames = [dis_c21, dis_c22]
df2 = df2 = pd.concat(frames, sort=False)
df2 = df2[df2.columns[0:7]]
df4 = df2.join(dis_bycode.set_index(['סמל יישוב'], verify_integrity=True),
on=['סמל ישוב'], how='inner')
df4 = df4.fillna(0)
"""Calculate Threshold"""
#Get Disqualified votes and calculate threshold.
disq = df4[['פסולים', 'סמל ישוב']]
#Avg between two elections.
disq = disq.groupby('סמל ישוב').mean()
disq = disq['פסולים']
disq = disq.sort_values(ascending=False)
# Top n% of the values
n = 5
thershold = np.round(disq.head(int(len(disq) * (n / 100))).iloc[-1])
print('threshold : ', thershold)
"""Train Data"""
df4['label'] = df4['פסולים'] > thershold
df4 = df4.drop(cols_to_drop,axis = 1 )
y_train = df4['label']
X_train = df4.drop(['label', 'פסולים'], axis=1)
"""Test Data"""
#From third elections build test x and y.
dis_c23 = dis_c23[dis_c23.columns[0:7]]
df41 = dis_c23.join(dis_bycode.set_index(['סמל יישוב'], verify_integrity=True),on=['סמל ישוב'], how='inner')
df41 = df41.fillna(0)
X_test = df41.drop(cols_to_drop,axis =1 )
X_test = X_test.drop('פסולים',axis=1)
y_test = df41['פסולים'] > thershold
The model will be trained on data of the first two elections
X_train
Before training the model we run feature selection and keep the best features in X_train. The pool of features we choose from is:
y_train
The target of the train is saved in y_train which is a boolean {True, False} representing if the settlements disqualified votes exceeded the threshold.
X_test
The x test data includes the same features we listed above for the third elections (also the features will be changed after using feature selection)
y_test
The target is, as before, a boolean indicating whether the settlement's disqualified votes exceed the threshold.
To ensure getting the best results we use a param grid and keep the best model.
# Hyperparameter grid for the random-forest classifier.
param_grid = {
    'n_estimators': [100, 150, 200],   # number of trees in the forest
    'max_depth': [None, 20, 50],       # maximum depth of each tree
    'max_features': ['sqrt', None],    # features considered per split
    'min_samples_split': [2, 5, 10],   # min samples needed to split a node
    'bootstrap': [True, False],        # whether to bootstrap-sample rows
}
# Feature selection for the classifier: keep the features whose importance
# (from a RandomForestClassifier) exceeds the selector's default threshold.
sfm = SelectFromModel(RandomForestClassifier())
sfm.fit(X_train, y_train)
# BUG FIX: the selected names must be read from X_train's columns BEFORE
# X_train is overwritten with the transformed ndarray; the original
# printed X.columns — the feature pool of the previous (AdaBoost) section,
# which has different columns — so the report showed the wrong features.
selected_names = [X_train.columns[ind] for ind in sfm.get_support(indices=True)]
X_train = sfm.transform(X_train)
X_test = sfm.transform(X_test)
print('selected features :')
for name in selected_names:
    print(name)
# Grid-search the classifier over param_grid with 5-fold CV and fit it
# on the (feature-selected) training data.
gsRFC = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
gsRFC.fit(X_train, y_train)
# Best hyperparameters found by the search.
gsRFC.best_params_
Run prediction and calculate model accuracy.
'''run prediction'''
# Evaluate the tuned classifier on the third-election data.
best_model = gsRFC.best_estimator_
y_pred = best_model.predict(X_test)
print("Accuracy on validation:", metrics.accuracy_score(y_test, y_pred))
# sum(y_pred) counts the True predictions, i.e. settlements that the
# model says need a representative.
print(f'Set representatives in {sum(y_pred)} settlements')
''' Confusion Matrix '''
# Plot the 2x2 confusion matrix with per-cell counts.
classes = ['False', 'True']
df_cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 10))
plt.imshow(df_cm, interpolation='nearest', cmap="Blues_r")
plt.title('Confusion Matrix', size=24)
plt.colorbar(aspect=4)
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, size=14)
plt.yticks(tick_marks, classes, size=14)
# Pick a text colour that contrasts with each cell: counts above half the
# maximum get black text, the rest white.
thresh = df_cm.max() / 2.
for row in range(2):
    for col in range(2):
        cell_colour = "black" if df_cm[row, col] > thresh else "white"
        plt.text(col, row, format(df_cm[row, col], 'd'), fontsize=20,
                 horizontalalignment="center", color=cell_colour)
plt.grid(None)
plt.tight_layout()
plt.ylabel('True label', size=18)
plt.xlabel('Predicted label', size=18)
plt.show()
Our model suggests we set representatives in 52 settlements, which is not a high number compared to the total number of settlements, 1214.